
*This do-File combines the vehicle price and characteristic data from EUROTAX with the characteristic data from TARGA and computes weighted average characteristics for the vehicle options based on Swiss-wide registration data!

use "$root/Data/Original/type_model_make.dta", clear

tab TYPSecFuel

*Only keep recent models that were imported in the years we are actually analyzing.
local dropdate=monthly("2015m12","YM")
drop if ImpEnd<`dropdate'


*Standardize the brand names to ensure better match quality between the different datasets:
replace MAKName=lower(MAKName)
replace MAKName=subinstr(MAKName,"alfa romeo","alfa-romeo",.)
replace MAKName=subinstr(MAKName,"ds automobiles","ds",.)
replace MAKName=subinstr(MAKName,"bmw-alpina","alpina",.)
replace MAKName=subinstr(MAKName,"land rover","land-rover",.)
replace MAKName=subinstr(MAKName,"mercedes-benz","mercedes",.)
replace MAKName=subinstr(MAKName,"aston martin","aston-martin",.)
replace MAKName=subinstr(MAKName,"ssang yong","ssangyong",.)


drop TYPName2
replace TYPName=lower(TYPName)
replace TYPName=trim(TYPName)
replace TYPName=subinstr(TYPName,"é","e",.)
replace TYPName=subinstr(TYPName,"i miev","i-miev",.)

gen marke_typ=MAKName+" "+TYPName

split TYPName


*Generate the Treibstoffcode variable which is a numeric coding for fuel type consistent with the TARGA (and registration) database:
gen treibstoffcode=""
replace treibstoffcode="B" if TYPFuel==100001 | TYPFuel==100002
replace treibstoffcode="C" if TYPFuel==100013
replace treibstoffcode="C" if TYPFuel==100001 & TYPSecKW>0
replace treibstoffcode="D" if TYPFuel==100003
replace treibstoffcode="E" if TYPFuel==100004
replace treibstoffcode="F" if TYPFuel==100014
replace treibstoffcode="F" if TYPFuel==100003 & TYPSecKW>0

**Check if there are cars with the description hybrid in the name that are not coded as hybrids: 

forvalues i=1/7 {
tab treibstoffcode if TYPName`i'=="hybrid"

}

forvalues i=1/7 {
replace treibstoffcode="C" if TYPName`i'=="hybrid" & treibstoffcode=="B"

}

forvalues i=1/7 {
replace treibstoffcode="F" if TYPName`i'=="hybrid" & treibstoffcode=="D"

}

replace TYPName1=TYPName1+TYPName2 if MAKName=="lexus"

keep MAKName TYPName* car_ID marke_typ treibstoffcode


save "$root/Data/Original/marke_ID_2015.dta", replace



*First we import the dataset of all car registrations in recent years:

use "$root/Data/Original/NEUZU_CH.dta", clear

*Drop the registrations from after 2019:
drop if year_reg>2019

*Add the TARGA-information:
merge m:1 tgcode using  "$root/Data/Original/TARGA.dta"

keep if _merge==3

drop _merge

encode energieeffizienzkategorie, g(env_cat) label(env_cat)

*Generate a copy of the original data:
frame put _all, into(car_reg_TARGA)


*Prepare for merge with price data:

split marke_typ, p(" ")

rename marke_typ1 MAKName
rename marke_typ2 TYPName1
rename marke_typ3 TYPName2
rename marke_typ4 TYPName3
rename marke_typ5 TYPName4
rename marke_typ6 TYPName5

*Collapse to prepare for merge with price-data:
collapse (firstnm) marke_typ MAKName TYPName* treibstoffcode, by(tgcode)


egen id=group(tgcode)

parallel setclusters 4

parallel: reclink MAKName marke_typ TYPName* treibstoffcode using "$root/Data/Original/marke_ID_2015.dta",idmaster(id) idusing(car_ID) gen(score) wmatch(30 10 20 10 4 3 2 1 1 10) required(MAKName treibstoffcode)



tab _merge

**Inspect the missings - most have the problem that the TYPName are missspelled. Maybe try reclink via TYPName constructed from the registration data and not from TARGA. 
** Additionaly check for the treibstoffcode mismatches

tab treibstoffcode if _merge==1

*What do the different codes mean?

*Next we recode the treibstoffcodes of the unmatched observations. 
frame put _all, into(unmatched)

frame change unmatched
keep if _merge==1
keep id tgcode  marke_typ treibstoffcode

*Change some potential reasons for mismatches:
replace marke_typ=lower(marke_typ)
replace marke_typ=trim(marke_typ)
replace marke_typ=subinstr(marke_typ,"volkswagen","vw",.)
replace marke_typ=subinstr(marke_typ,"mercedes benz","mercedes",.)
replace marke_typ=subinstr(marke_typ,"mercedes-benz","mercedes",.)
replace marke_typ=subinstr(marke_typ,"mercedes-amg","mercedes amg",.)
replace marke_typ=subinstr(marke_typ,"alfa romeo","alfa-romeo",.)
replace marke_typ=subinstr(marke_typ,"bmw alpina","alpina",.)
replace marke_typ=subinstr(marke_typ,"bmw-alpina","alpina",.)
replace marke_typ=subinstr(marke_typ,"land rover","land-rover",.)
replace marke_typ=subinstr(marke_typ,"landrover","land-rover",.)
replace marke_typ=subinstr(marke_typ,"rolls royce","rolls-royce",.)
replace marke_typ=subinstr(marke_typ,"aston martin","aston-martin",.)
replace marke_typ=subinstr(marke_typ,"ë","e",.)
replace marke_typ=subinstr(marke_typ,"quattro audi","audi",.)
replace marke_typ=subinstr(marke_typ,"range rover","land-rover",.)
replace marke_typ=subinstr(marke_typ,"alfa-romeo alfa-romeo","alfa-romeo",.)
replace marke_typ=subinstr(marke_typ,"alfa-romeo alfa","alfa-romeo",.)
replace marke_typ=subinstr(marke_typ,")","",.)
replace marke_typ=subinstr(marke_typ,"(","",.)
replace marke_typ=subinstr(marke_typ,"qu.","quattro",.)
replace marke_typ=subinstr(marke_typ,"-cng-technik","",.)
replace marke_typ=subinstr(marke_typ,"-dangel","",.)
replace marke_typ=subinstr(marke_typ,"stelvio","stelvio ",.)
replace marke_typ=subinstr(marke_typ,"giulia","giulia ",.)
replace marke_typ=subinstr(marke_typ,"discovery","discovery ",.)
replace marke_typ=subinstr(marke_typ,"discov.","discovery ",.)
replace marke_typ=subinstr(marke_typ,"disco.","discovery ",.)
replace marke_typ=subinstr(marke_typ,"octavia","octavia ",.)
replace marke_typ=subinstr(marke_typ,"hev"," hev",.)
replace marke_typ=subinstr(marke_typ,"spacetourer","spacetourer ",.)
replace marke_typ=subinstr(marke_typ,"rangerover","rangerover ",.)
replace marke_typ=subinstr(marke_typ,"grandc-max","grandc-max ",.)
replace marke_typ=subinstr(marke_typ,"vito","vito ",.)
replace marke_typ=subinstr(marke_typ,"grandcherokee","gr.cherokee ",.)
replace marke_typ=subinstr(marke_typ,"baleno","baleno ",.)
replace marke_typ=subinstr(marke_typ, "grand vitara","vitara",.)
replace marke_typ=subinstr(marke_typ, "gr. vitara","vitara",.)
replace marke_typ=subinstr(marke_typ, "grd vitara","vitara",.)
replace marke_typ=subinstr(marke_typ, "grd.vitara","vitara",.)
replace marke_typ=subinstr(marke_typ, "rav4","rav-4",.)

split marke_typ, p(" ")
rename marke_typ1 MAKName
rename marke_typ2 TYPName1
rename marke_typ3 TYPName2
rename marke_typ4 TYPName3
rename marke_typ5 TYPName4
rename marke_typ6 TYPName5

forvalues i=1/5{

replace TYPName`i'=trim(TYPName`i')

}

replace treibstoffcode="C" if treibstoffcode=="R" 
replace treibstoffcode="C" if treibstoffcode=="F" 




parallel setclusters 2
parallel: reclink MAKName marke_typ TYPName* treibstoffcode using "$root/Data/Original/marke_ID_2015.dta",idmaster(id) idusing(car_ID) gen(score) wmatch(30 10 20 10 4 3 2 10) required(MAKName treibstoffcode)

tab _merge 

drop if _merge==1
drop _merge


frame change default

frameappend unmatched, drop

drop if _merge==1

drop _merge


merge m:1 tgcode using "$root/Data/Original/TARGA.dta", update

keep if _merge!=2

drop _merge

preserve
collapse (max)score, by(tgcode)
tempfile max_score
save `max_score', replace

restore
merge m:1 tgcode score using `max_score'
keep if _merge==3

drop if score<0.70

duplicates drop tgcode, force

keep tgcode UMAKName UTYPName1 UTYPName2 treibstoffcode car_ID

** Add the matched car_ID variable to the frame of the TARGA-Panel: 

frame change car_reg_TARGA

frlink m:1 tgcode, frame(default) gen(car_ID_link)
drop treibstoffcode

frget UMAKName UTYPName1 UTYPName2 treibstoffcode car_ID, from(car_ID_link)

** Only keep relevant sample and merge the price data as well. 
keep tgcode year_reg UMAKName UTYPName1 UTYPName2 freq_reg reg_BE treibstoffcode car_size car_height et_verbrauch et_co2 el_verbrauch env_cat car_ID

drop if missing(car_ID)

** Create the price averages by year in order to match the year and car_ID: 
frame create prices
frame change prices

use "$root/Data/original/Preisdaten/carprices.dta", clear
drop if time>ImpEnd

local dropdate=monthly("2015m12","YM")
drop if time<`dropdate'

gen td=dofm(time)
gen year_reg=yofd(td)

collapse (mean) PRHnewprice, by(car_ID year_reg)

frame change car_reg_TARGA
frlink m:1 car_ID year_reg, frame(prices) gen(price_link)
frget PRHnewprice, from(price_link)

*The models that could not be merged are cars that were no longer imported after 2016. Hence, they should not be considered an option for a household purchasing a new car.
drop if _merge==2
drop _merge

rename UMAKName MAKName
rename UTYPName1 TYPName1
rename UTYPName2 TYPName2


*merge m:1 car_ID using "$root\Data\original\Preisdaten\type_model_make.dta", keepusing(TYPSeg1 TYPKW)
merge m:1 car_ID using "$root/Data/original/Preisdaten/type_model_make.dta", keepusing(TYPSeg1 TYPKW TYPWeight)

tab _merge
keep if _merge==3
drop _merge

rename TYPWeight weight

*Generate some of the variables necessary:

replace TYPName1=TYPName1 + " " + TYPName2 if MAKName=="tesla"


gen drivetype=.
replace drivetype=1 if treibstoffcode=="B"
replace drivetype=2 if treibstoffcode=="D"
replace drivetype=4 if treibstoffcode=="C" | treibstoffcode=="F"
replace drivetype=3 if treibstoffcode=="E"

label define treibstoffcode 1 "gasoline" 2 "diesel" 3 "electric" 4 "hybrid"
drop treibstoffcode

egen car_option=group(MAKName TYPName1 drivetype)
egen total_reg_ann=sum(freq_reg), by(car_option year_reg)
egen total_reg=sum(freq_reg), by(car_option)
egen BE_reg=sum(reg_BE), by(car_option)

egen car_type=group(TYPSeg1)

recode car_type (1=2) (3=7) (6=9)

tab car_type, g(car_cat_FE)


collapse (mean) car_size car_height et_verbrauch et_co2 el_verbrauch env_cat PRHnewprice weight TYPKW car_cat_FE* (firstnm) MAKName TYPName1 drivetype total_reg BE_reg [fw=freq_reg], by(car_option year_reg)

*Now round the variables and reassign some of the value labels. 
replace env_cat=round(env_cat)

*Make sure that only one car_cat_FE is one: 

egen max_cat=rowmax(car_cat_FE*)

foreach var of varlist car_cat_FE1 car_cat_FE2 car_cat_FE3 car_cat_FE4 car_cat_FE5 car_cat_FE6 car_cat_FE7 car_cat_FE8 {
	
	replace `var'=0 if `var'!=max_cat
	replace `var'=1 if `var' == max_cat
	
}

drop max_cat

*Drop some of the missing variables:
drop if missing(et_verbrauch)



save "$root/Data/Produced/car_options.dta", replace

